Lab 6

Author

Mel Balcarcel Arias

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.7     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.3.1     ✔ yardstick    1.3.2
✔ recipes      1.2.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
library(ggthemes)
library(patchwork)
library(xgboost)

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
library(parsnip)
library(workflows)
library(caret)
Loading required package: lattice

Attaching package: 'caret'

The following objects are masked from 'package:yardstick':

    precision, recall, sensitivity, specificity

The following object is masked from 'package:purrr':

    lift
library(tune)
library(kernlab)

Attaching package: 'kernlab'

The following object is masked from 'package:dials':

    buffer

The following object is masked from 'package:scales':

    alpha

The following object is masked from 'package:purrr':

    cross

The following object is masked from 'package:ggplot2':

    alpha

Question 1

# Question 1: download the CAMELS documentation PDF and the six attribute
# tables, then merge the tables into one data frame keyed on gauge_id.
root  <- 'https://gdex.ucar.edu/dataset/camels/file'

# Ensure the destination folder exists first — download.file() errors if the
# directory is missing.
dir.create('data', showWarnings = FALSE, recursive = TRUE)

# Build the PDF URL from `root` instead of repeating the full URL.
download.file(glue('{root}/camels_attributes_v2.0.pdf'),
              'data/camels_attributes_v2.0.pdf')

# The six CAMELS attribute groups
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")
# Where the files live online ...
remote_files  <- glue('{root}/camels_{types}.txt')
# where we want to download the data ...
local_files   <- glue('data/camels_{types}.txt')
walk2(remote_files, local_files, download.file, quiet = TRUE)
# Read each attribute file, then full-join the list of tables on gauge_id
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels, by = 'gauge_id')
# Question 2 
# zero_q_freq means the frequency of days with Q = 0 mm/day (zero flow), expressed as a percentage

Question 2

# Question 2: map the gauge locations, colored by mean daily discharge
ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

# Same map, colored by aridity (PET / precipitation ratio)
p1 <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "blue", high = "yellow") +
  ggthemes::theme_map() +
  labs(title = "Aridity (PET to Precipitation Ratio)")

# And colored by mean daily precipitation
p2 <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "red", high = "dodgerblue") +
  ggthemes::theme_map() +
  labs(title = "Mean Daily Precipitation")

# patchwork: display the two maps side by side
p1 + p2

Model Preparation

# Correlation matrix of the candidate predictors (aridity, p_mean)
# and the response (q_mean), computed on complete cases only.
cor(drop_na(select(camels, aridity, p_mean, q_mean)))
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000

Visual EDA

# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

# Same scatter, but with log-scaled axes to linearize the relationship
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  # Fixed title typo: "Runnoff" -> "Runoff"
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

# Log-scaled axes AND a log-transformed color scale for mean flow
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) +
  # Fixed title typo: "Runnoff" -> "Runoff"
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

Model Building

# Fix the RNG seed BEFORE initial_split() so the split is reproducible
set.seed(123)
# Bad form to perform simple transformations on the outcome variable within a 
# recipe. So, we'll do it here.
camels <- camels |> 
  mutate(logQmean = log(q_mean))

# Generate the split (80% train / 20% test)
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

# 10-fold cross-validation on the training data
camels_cv <- vfold_cv(camels_train, v = 10)
# Create a recipe to preprocess the data
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) %>%
  # Add an interaction term between aridity and p_mean
  # (step_interact names the new column `aridity_x_p_mean`)
  step_interact(terms = ~ aridity:p_mean) |> 
  # Drop any rows with missing values in the pred
  step_naomit(all_predictors(), all_outcomes())
# Prepare the data: estimate the recipe on training data and return the
# transformed training set (bake with new_data = NULL)
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

# Interaction with lm
#  Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Sanity Interaction term from recipe ... these should be equal!!
# (the recipe's `aridity_x_p_mean` column should reproduce the `aridity:p_mean`
#  term from the base-R fit above, coefficient for coefficient)
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Apply the trained recipe to the test split, predict with the base lm,
# and compute test-set regression metrics (rmse, rsq, mae)
test_data <-  bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
# Observed vs predicted log mean flow; the dashed 1:1 line marks perfect fit
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Apply a gradient color scale
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

Using a workflow

# Linear regression model specification: lm engine, regression mode
lm_model <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

# Bundle the preprocessing recipe with the model in a single workflow,
# then fit the whole pipeline on the training split.
lm_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(lm_model) |>
  fit(data = camels_train)

# Pull the underlying lm object out of the fitted workflow and
# inspect its coefficient table.
summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
# From the base implementation
# (should match the workflow fit above, apart from the interaction-term name)
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
#
# Predict on the test split; augment() appends a .pred column
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61
# Test-set metrics for the workflow-fitted linear model
metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
# Observed vs predicted for the workflow-fitted linear model (1:1 reference line)
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

Switch it up

library(baguette)  # NOTE(review): already attached earlier in the file; redundant but harmless
# Random forest with impurity-based variable importance (for vip)
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train) 
# Predict on the test split and evaluate the random forest
rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.587
2 rsq     standard       0.741
3 mae     standard       0.363
# Observed vs predicted for the random forest (1:1 reference line)
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

# Compare the linear model and random forest via 10-fold cross-validation
wf <- workflow_set(list(rec), list(lm_model, rf_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

# Rank workflows by cross-validated R-squared
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.563  0.0247    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.771  0.0259    10 recipe       rand…     1
3 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     2
4 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     2

Question 3

# Question 3: add an XGBoost regressor and a bagged neural network,
# fit both on the training split, and evaluate on the held-out test set.
xgb_model <- boost_tree(mode = "regression") %>%
  set_engine("xgboost")
nnet_model <- bag_mlp(mode = "regression") %>%
  set_engine("nnet")
xgb_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(xgb_model) %>%
  fit(data = camels_train)

nnet_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(nnet_model) %>%
  fit(data = camels_train)
xgb_preds <- predict(xgb_wf, camels_test) %>%
  bind_cols(camels_test)

nnet_preds <- predict(nnet_wf, camels_test) %>%
  bind_cols(camels_test)

# BUG FIX: these models predict logQmean, so metrics must be computed against
# logQmean. The original used `truth = aridity`, comparing predictions of the
# outcome against a *predictor*, which makes the reported metrics meaningless.
metrics_xgb <- xgb_preds %>%
  metrics(truth = logQmean, estimate = .pred)

metrics_nnet <- nnet_preds %>%
  metrics(truth = logQmean, estimate = .pred)
print(metrics_xgb)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       2.00 
2 rsq     standard       0.757
3 mae     standard       1.46 
# Cross-validated comparison of all four models
# NOTE(review): this reuses the name `wf`, overwriting the earlier
# two-model workflow set.
wf <- workflow_set(
  preproc = list(rec),  # your recipe
  models = list(
    linear = lm_model,
    random_forest = rf_model,
    xgboost = xgb_model,
    bagged_nnet = nnet_model
  )
) %>%
  workflow_map("fit_resamples", resamples = camels_cv)
# Rank all four models by cross-validated R-squared
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_bagged_nn… Prepro… rmse    0.547  0.0282    10 recipe       bag_…     1
2 recipe_bagged_nn… Prepro… rsq     0.786  0.0234    10 recipe       bag_…     1
3 recipe_random_fo… Prepro… rmse    0.564  0.0260    10 recipe       rand…     2
4 recipe_random_fo… Prepro… rsq     0.770  0.0266    10 recipe       rand…     2
5 recipe_linear     Prepro… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_linear     Prepro… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_xgboost    Prepro… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_xgboost    Prepro… rsq     0.745  0.0268    10 recipe       boos…     4
# Based on these results, I would choose the bagged neural network model, since it outperformed the other models in both RMSE and R-squared.

Question 4

# Question 4: check how the candidate predictors correlate with mean flow
camels |> 
  select(q_mean, runoff_ratio, low_prec_freq) |> 
  drop_na() |> 
  cor()
                  q_mean runoff_ratio low_prec_freq
q_mean         1.0000000    0.8755618    -0.7145711
runoff_ratio   0.8755618    1.0000000    -0.7259121
low_prec_freq -0.7145711   -0.7259121     1.0000000
# Drop rows missing either predictor
camels_clean <- camels %>%
  filter(!is.na(runoff_ratio) & !is.na(low_prec_freq))
# log(x + 1) transform to reduce the right skew of runoff_ratio
camels_clean <- camels_clean %>%
  mutate(runoff_ratio_log = log(runoff_ratio + 1))

Data Splitting

# Reproducible 75/25 train/test split and 10-fold CV folds
set.seed(142)

camels_split3 <- initial_split(camels_clean, prop = 0.75)
camels_train3 <- training(camels_split3)
camels_test3 <- testing(camels_split3)

camels_cv3 <- vfold_cv(camels_train3, v = 10)

Recipe

# Recipe: predict logQmean from log runoff ratio and low-precipitation
# frequency, with their interaction; drop incomplete rows
rec3 <- recipe(logQmean ~ runoff_ratio_log + low_prec_freq, data = camels_train3) %>%
  step_interact(terms = ~ runoff_ratio_log:low_prec_freq) %>%
  step_naomit(all_predictors(), all_outcomes())
# I wanted to use low precipitation frequency and runoff ratio because I felt like they would both have a good relationship with our streamflow mean. Runoff ratio was skewed to the right so I added a log transform but I kept low_prec_freq as is because the distribution looked relatively normal. 

Define 3 Models

# Random forest: 500 trees, 2 candidate predictors tried at each split
randf_model <- rand_forest(mtry = 2, trees = 500) |>
  set_engine("ranger") |>
  set_mode("regression")

# Ordinary least-squares baseline
linm_model <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

# Support vector machine with a radial basis function kernel
svm_model <- svm_rbf() |>
  set_engine("kernlab") |>
  set_mode("regression")

Workflow Set

# Build the workflow set. Name the models so the generated wflow_id values
# are readable (e.g. "recipe_random_forest" instead of "recipe_rand_forest_1"),
# matching the naming style used in the Question 3 workflow set.
wf_set <- workflow_set(
  preproc = list(rec3),
  models = list(
    random_forest = randf_model,
    linear = linm_model,
    svm = svm_model
  )
)

# Fit every workflow across the 10 CV folds
wf_results <- wf_set %>%
  workflow_map("fit_resamples", resamples = camels_cv3)

Evaluation

# Visual comparison of the resampling results across the three models
autoplot(wf_results)

# Rank the workflows on the default metrics (rmse, rsq)
ranked_results <- rank_results(wf_results)
ranked_results
# A tibble: 6 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.264 0.00951    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.952 0.00446    10 recipe       rand…     1
3 recipe_svm_rbf    Prepro… rmse    0.303 0.0196     10 recipe       svm_…     2
4 recipe_svm_rbf    Prepro… rsq     0.938 0.00663    10 recipe       svm_…     2
5 recipe_linear_reg Prepro… rmse    0.426 0.0167     10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.878 0.00805    10 recipe       line…     3
# The random forest works best here: it achieves both the lowest RMSE and the highest R-squared.

Extract and Evaluate

# Fit the best model (random forest) on the full training split,
# then evaluate it on the held-out test split
rf_workflow <- workflow() %>%
  add_recipe(rec3) %>%
  add_model(randf_model)

rf_fit <- rf_workflow %>%
  fit(data = camels_train3)

# augment() appends a .pred column to the test data
rf_predictions <- rf_fit %>%
  augment(new_data = camels_test3)

# Observed vs predicted, with points colored by absolute prediction error
ggplot(rf_predictions, aes(x = logQmean, y = .pred)) +
  geom_point(aes(color = abs(logQmean - .pred)), size = 3, alpha = 0.7) +
  scale_color_viridis_c() +
  labs(
    title = "Observed vs Predicted LogQmean (Random Forest Model)",
    x = "Observed LogQmean",
    y = "Predicted LogQmean",
    color = "Absolute Error"
  ) +
  theme_minimal()

# Based on this graph, the random forest model is highly accurate: the points fall along a relatively straight 1:1 line with little spread around it.